{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2idx = {'<pad>': 0}\n",
    "tag2idx = {'<pad>': 0}\n",
    "word_idx = 1\n",
    "tag_idx = 1\n",
    "x_train = []\n",
    "y_train = []\n",
    "x_test = []\n",
    "y_test = []\n",
    "\n",
    "for line in open('pos_train.txt'):\n",
    "    line = line.rstrip()\n",
    "    if line:\n",
    "        word, tag, _ = line.split()\n",
    "        if word not in word2idx:\n",
    "            word2idx[word] = word_idx\n",
    "            word_idx += 1\n",
    "        x_train.append(word2idx[word])\n",
    "        if tag not in tag2idx:\n",
    "            tag2idx[tag] = tag_idx\n",
    "            tag_idx += 1\n",
    "        y_train.append(tag2idx[tag])\n",
    "        \n",
    "word2idx['<unknown>'] = word_idx\n",
    "\n",
    "for line in open('pos_test.txt'):\n",
    "    line = line.rstrip()\n",
    "    if line:\n",
    "        word, tag, _ = line.split()\n",
    "        if word in word2idx:\n",
    "            x_test.append(word2idx[word])\n",
    "        else:\n",
    "            x_test.append(word_idx)\n",
    "        y_test.append(tag2idx[tag])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'seq_len': 20,\n",
    "    'batch_size': 128,\n",
    "    'hidden_dim': 128,\n",
    "    'clip_norm': 5.0,\n",
    "    'text_iter_step': 1,\n",
    "    'lr': {'start': 5e-3, 'end': 5e-4},\n",
    "    'n_epoch': 1,\n",
    "    'display_step': 50,\n",
    "    'vocab_size': len(word2idx),\n",
    "    'n_class':tag_idx\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def iter_seq(x):\n",
    "    return np.array([x[i: i+params['seq_len']] for i in range(0, len(x)-params['seq_len'], params['text_iter_step'])])\n",
    "\n",
    "def to_train_seq(*args):\n",
    "    return [iter_seq(x) for x in args]\n",
    "\n",
    "def to_test_seq(*args):\n",
    "    return [np.reshape(x[:(len(x)-len(x)%params['seq_len'])],\n",
    "        [-1,params['seq_len']]) for x in args]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, Y_train = to_train_seq(x_train, y_train)\n",
    "X_test, Y_test = to_test_seq(x_test, y_test)\n",
    "params['lr']['steps'] = len(X_train) // params['batch_size']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(self):\n",
    "        self.X = tf.placeholder(tf.int32, [None, params['seq_len']])\n",
    "        self.Y = tf.placeholder(tf.int32, [None, params['seq_len']])\n",
    "        \n",
    "        def rnn_cell():\n",
    "            return tf.nn.rnn_cell.GRUCell(params['hidden_dim'],\n",
    "                                          kernel_initializer=tf.orthogonal_initializer())\n",
    "        \n",
    "        def clip_grads(loss):\n",
    "            variables = tf.trainable_variables()\n",
    "            grads = tf.gradients(loss, variables)\n",
    "            clipped_grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])\n",
    "            return zip(clipped_grads, variables)\n",
    "        \n",
    "        self.embedding = tf.Variable(tf.truncated_normal([params['vocab_size'], params['hidden_dim']],\n",
    "                                                      stddev=1.0 / np.sqrt(params['hidden_dim'])))\n",
    "        embedded = tf.nn.embedding_lookup(self.embedding, self.X)\n",
    "        embedded = tf.nn.dropout(embedded,0.1)\n",
    "        bi_outputs, _ = tf.nn.bidirectional_dynamic_rnn(\n",
    "            rnn_cell(), rnn_cell(), embedded, dtype=tf.float32)\n",
    "        x = tf.concat(bi_outputs, -1)\n",
    "        self.logits = tf.layers.dense(x, params['n_class'])\n",
    "        log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(\n",
    "        self.logits, self.Y, tf.count_nonzero(self.X, 1))\n",
    "        self.cost = tf.reduce_mean(-log_likelihood)\n",
    "        self.global_step = tf.Variable(0, trainable=False)\n",
    "        self.learning_rate = tf.train.exponential_decay(params['lr']['start'],\n",
    "                                                        self.global_step, params['lr']['steps'],\n",
    "                                                        params['lr']['end']/params['lr']['start'])\n",
    "        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(clip_grads(self.cost), \n",
    "                                                                                    global_step=self.global_step)\n",
    "        self.crf_decode = tf.contrib.crf.crf_decode(self.logits, \n",
    "                                                    trans_params, \n",
    "                                                    tf.count_nonzero(self.X, 1))[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gradients_impl.py:96: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model()\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, step 1, loss 77.127289\n",
      "epoch 1, step 50, loss 26.483004\n",
      "epoch 1, step 100, loss 17.215683\n",
      "epoch 1, step 150, loss 14.920134\n",
      "epoch 1, step 200, loss 16.038221\n",
      "epoch 1, step 250, loss 9.415933\n",
      "epoch 1, step 300, loss 9.295267\n",
      "epoch 1, step 350, loss 12.208104\n",
      "epoch 1, step 400, loss 9.211021\n",
      "epoch 1, step 450, loss 8.099480\n",
      "epoch 1, step 500, loss 8.903303\n",
      "epoch 1, step 550, loss 7.738531\n",
      "epoch 1, step 600, loss 10.155366\n",
      "epoch 1, step 650, loss 7.592325\n",
      "epoch 1, step 700, loss 3.969016\n",
      "epoch 1, step 750, loss 3.658270\n",
      "epoch 1, step 800, loss 7.450172\n",
      "epoch 1, step 850, loss 7.433585\n",
      "epoch 1, step 900, loss 7.305661\n",
      "epoch 1, step 950, loss 7.443968\n",
      "epoch 1, step 1000, loss 1.441902\n",
      "epoch 1, step 1050, loss 6.485475\n",
      "epoch 1, step 1100, loss 4.171388\n",
      "epoch 1, step 1150, loss 5.776435\n",
      "epoch 1, step 1200, loss 5.577882\n",
      "epoch 1, step 1250, loss 4.656260\n",
      "epoch 1, step 1300, loss 7.389355\n",
      "epoch 1, step 1350, loss 3.161110\n",
      "epoch 1, step 1400, loss 4.540032\n",
      "epoch 1, step 1450, loss 2.491982\n",
      "epoch 1, step 1500, loss 3.317016\n",
      "epoch 1, step 1550, loss 12.681192\n",
      "epoch 1, step 1600, loss 3.148616\n",
      "epoch 1, step 1650, loss 1.940303\n",
      "epoch 1, avg loss 10.018621\n"
     ]
    }
   ],
   "source": [
    "for i in range(params['n_epoch']):\n",
    "    total_cost = 0\n",
    "    for k in range(0,(X_train.shape[0] // params['batch_size'])*params['batch_size'],params['batch_size']):\n",
    "        batch_x = X_train[k:k+params['batch_size']]\n",
    "        batch_y = Y_train[k:k+params['batch_size']]\n",
    "        step, loss, _ = sess.run([model.global_step, model.cost, model.optimizer],\n",
    "                                feed_dict={model.X:batch_x, model.Y:batch_y})\n",
    "        if step % params['display_step'] == 0 or step == 1:\n",
    "            print('epoch %d, step %d, loss %f'%(i+1,step,loss))\n",
    "        total_cost += loss\n",
    "    total_cost /= ((X_train.shape[0] // params['batch_size']))\n",
    "    print('epoch %d, avg loss %f'%(i+1,total_cost))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "        NNP       0.87      0.90      0.89      6639\n",
      "        VBZ       0.97      0.99      0.98      5070\n",
      "        WRB       0.99      0.99      0.99      4020\n",
      "         WP       0.94      0.86      0.90       912\n",
      "        WP$       0.92      0.83      0.87      1354\n",
      "         RB       0.82      0.73      0.77      1103\n",
      "        VBN       0.99      0.99      0.99      1177\n",
      "        RBS       0.92      0.86      0.89      1269\n",
      "       PRP$       0.86      0.81      0.83      2962\n",
      "         IN       0.89      0.84      0.86      3034\n",
      "         CC       0.79      0.91      0.85      4803\n",
      "          #       0.99      0.99      0.99      2389\n",
      "        NNS       0.99      0.98      0.99      1214\n",
      "         EX       0.95      0.97      0.96       433\n",
      "         ``       0.99      0.99      0.99      1974\n",
      "         TO       0.85      0.83      0.84       539\n",
      "          )       0.78      0.69      0.73       727\n",
      "         NN       0.97      0.97      0.97       421\n",
      "         JJ       0.92      0.91      0.92      1918\n",
      "       NNPS       0.98      0.98      0.98       323\n",
      "          .       0.99      0.97      0.98       316\n",
      "         UH       0.86      0.91      0.89      1679\n",
      "         CD       1.00      0.90      0.95        48\n",
      "        SYM       0.98      0.96      0.97       470\n",
      "        RBR       0.86      0.55      0.67        11\n",
      "          (       1.00      0.90      0.95        77\n",
      "          $       0.99      0.98      0.98       384\n",
      "         VB       0.95      0.90      0.92        77\n",
      "         RP       0.85      0.26      0.40       130\n",
      "        VBP       0.99      0.98      0.98       814\n",
      "         MD       0.86      0.74      0.80        77\n",
      "         ''       0.95      0.91      0.93       110\n",
      "        VBG       0.87      0.56      0.68        70\n",
      "          ,       0.84      0.82      0.83       202\n",
      "        VBD       0.97      0.83      0.89       202\n",
      "        PDT       0.97      0.94      0.95        93\n",
      "        WDT       1.00      0.92      0.96        49\n",
      "        JJR       0.36      0.40      0.38        10\n",
      "          :       0.22      0.17      0.19        12\n",
      "      <pad>       0.97      0.98      0.97       238\n",
      "         FW       0.00      0.00      0.00         4\n",
      "        PRP       1.00      0.50      0.67         4\n",
      "        JJS       0.00      0.00      0.00         2\n",
      "\n",
      "avg / total       0.91      0.91      0.91     47360\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1428: UserWarning: labels size, 43, does not match size of target_names, 45\n",
      "  .format(len(labels), len(target_names))\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "Y_pred = sess.run(model.crf_decode,feed_dict={model.X:X_test})\n",
    "print(classification_report(Y_test.ravel(), Y_pred.ravel(), target_names=tag2idx.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
